In [1]:
import pandas as pd
from scipy import stats
from IPython.core.interactiveshell import InteractiveShell
import hvplot.pandas

pd.options.plotting.backend = "holoviews"


InteractiveShell.ast_node_interactivity = "all"
import numpy as np
import panel as pn
import scipy
from panel.template import DefaultTheme

pn.extension()
In [2]:
from plotnine import ggplot, aes, geom_histogram
cred = {"host": 'localhost', 'dbname': 'yukontaf', 'user': 'glebsokolov', 'password': ''}
from sqlalchemy import create_engine

con = create_engine(
    f'postgresql://{cred["user"]}:{cred["password"]}@{cred["host"]}/{cred["dbname"]}'
)


def select(sql):
    return pd.read_sql(sql, con)
In [3]:
sql = '''select * from noshowappointment'''
df = select(sql)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 15 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   index           110527 non-null  int64  
 1   Unnamed: 0      110527 non-null  int64  
 2   PatientId       110527 non-null  float64
 3   Gender          110527 non-null  float64
 4   ScheduledDay    110527 non-null  object 
 5   AppointmentDay  110527 non-null  object 
 6   Age             110527 non-null  int64  
 7   Neighbourhood   110527 non-null  float64
 8   Scholarship     110527 non-null  int64  
 9   Hypertension    110527 non-null  int64  
 10  Diabetes        110527 non-null  int64  
 11  Alcoholism      110527 non-null  int64  
 12  Handcap         110527 non-null  int64  
 13  SMS_received    110527 non-null  int64  
 14  No-show         110527 non-null  float64
dtypes: float64(4), int64(9), object(2)
memory usage: 12.6+ MB
In [4]:
df.head().hvplot.table(width=1480, height=240)
Out[4]:

First, let's preprocess the data: make the target variable boolean, make ordinal encoding for the Neighborhood and Gender column, cast columns, containing dates to datetime type, drop AppointmentID (as we wont need it), and I will also calculate a new feature - awaiting time. Then, let's also check the data for outliers and ridiculous records (i.e. negative age and so on)

In [5]:
from sklearn.preprocessing import OrdinalEncoder

del df["Unnamed: 0"]
df[["ScheduledDay", "AppointmentDay"]] = df[["ScheduledDay", "AppointmentDay"]].apply(
    lambda x: pd.to_datetime(x)
)
enc = OrdinalEncoder()
df[["Gender", "Neighbourhood", "No-show"]] = enc.fit_transform(
    df[["Gender", "Neighbourhood", "No-show"]]
)
df["AwaitingTime"] = -(df["ScheduledDay"] - df["AppointmentDay"]).dt.days
text2 = """After preprocessing our"""
tab2 = df.head().hvplot.table(width=820 * 2, height=420 * 2)
In [6]:
df.query("Age==-1 or AwaitingTime<0")
df.sort_values(by="Age", ascending=True)["Age"].unique()[:10]
df = df.drop(index=df.query("Age==-1 or AwaitingTime<0").index)
Out[6]:
See Full Dataframe in Mito
index PatientId Gender ScheduledDay AppointmentDay Age Neighbourhood Scholarship Hypertension Diabetes Alcoholism Handcap SMS_received No-show AwaitingTime
27033 27033 7.839273e+12 1.0 2016-05-10 10:51:53+00:00 2016-05-09 00:00:00+00:00 38 59.0 0 0 0 0 1 0 1.0 -1
55226 55226 7.896294e+12 0.0 2016-05-18 14:50:41+00:00 2016-05-17 00:00:00+00:00 19 69.0 0 0 0 0 1 0 1.0 -1
64175 64175 2.425226e+13 0.0 2016-05-05 13:43:58+00:00 2016-05-04 00:00:00+00:00 22 13.0 0 0 0 0 0 0 1.0 -1
71533 71533 9.982316e+14 0.0 2016-05-11 13:49:20+00:00 2016-05-05 00:00:00+00:00 81 69.0 0 0 0 0 0 0 1.0 -6
72362 72362 3.787482e+12 1.0 2016-05-04 06:50:57+00:00 2016-05-03 00:00:00+00:00 7 78.0 0 0 0 0 0 0 1.0 -1
99832 99832 4.659432e+14 0.0 2016-06-06 08:58:13+00:00 2016-06-06 00:00:00+00:00 -1 60.0 0 0 0 0 0 0 0.0 0
Out[6]:
array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8])

As we can see here, we have several recordings with negative awaiting time, we delete them. We also see, that we have a recording with negative age.

Now and further 1 will encode a group who didn't come, and 0 who come.

Now let's view at the distributions of the categorical variables, in order to do so, let's build bar plots, and histogram for age, one more bar plot will include all the neighborhoods.

In [7]:
show_opts = [
    1.0,
    0.0,
]
show_opt = pn.widgets.Select(name="No-show", options=show_opts)
idf = df.interactive()
data_pipeline = idf[idf["No-show"] == show_opt]
In [11]:
text01 = '''First, let's have a look at the distributions of ages and neighbourhoods'''
viz1 = df["Age"].hvplot(kind="hist", title='Histogram for Ages')
viz1
Out[11]:
In [12]:
group = df.groupby(["Neighbourhood", "No-show"]).count()["PatientId"]
viz2 = group.plot.bar(
    stacked=True, title="Neighbourhood", rot=90, width=1024
) + group.hvplot.table(width=360)
viz2
Out[12]:

Now, lets write a function that will calculate confidence interval for the difference of the two proportions (for the show and not-show groups). Then, write a function that will draw a conclusion given the confidence interval.

In [13]:
def proportions_confint_diff(sample1, sample2, alpha=0.05):
    z = scipy.stats.norm.ppf(1 - alpha / 2.0)
    p1 = float(sum(sample1)) / len(sample1)
    p2 = float(sum(sample2)) / len(sample2)

    left_boundary = (p1 - p2) - z * np.sqrt(
        p1 * (1 - p1) / len(sample1) + p2 * (1 - p2) / len(sample2)
    )
    right_boundary = (p1 - p2) + z * np.sqrt(
        p1 * (1 - p1) / len(sample1) + p2 * (1 - p2) / len(sample2)
    )
    return (round(left_boundary*100, 3), round(right_boundary*100, 3))


def calculate_ci(group, alpha):
    boundaries = proportions_confint_diff(
        df[df[group] == 1]["No-show"], df[df[group] == 0]["No-show"], alpha=alpha
    )
    if boundaries[0] < 0 and boundaries[1] < 0:
        return f"CI is {boundaries}, proportion in the first ({group}=0) group tends to be less"
    elif boundaries[0] > 0 and boundaries[1] > 0:
        return (
            f"CI is {boundaries}, proportion in the first group ({group}=0) tends to be greater"
        )
    else:
        return f"CI is {boundaries}, can not make definitive decision"
In [26]:
from scipy.stats import bootstrap

The function below will draw a bar plot for each categorical variable with a table, where a prior probability for show and no-show situations for each subcategory will be calculated

In [15]:
def group_draw(by):
    group = (
        df.groupby([by, "No-show"])
        .count()[["PatientId"]]
        .rename(columns={"PatientId": "Count"})
    )
    ind = group.index
    group = group.reset_index()
    group["ShowProb"] = pd.concat(
        [
            group[group[by] == 0].transform(lambda x: x / sum(x))["Count"],
            group[group[by] == 1].transform(lambda x: x / sum(x))["Count"],
        ]
    )
    group = group[["Count", "ShowProb"]].set_index(ind)
    return group.plot.bar(stacked=True, title=by) + group.hvplot.table(
        width=360
    )

Now lets's have a look at the barplots for each category combined with the tables where the probability of (no)-show is calculated

In [16]:
viz3 = group_draw("Scholarship")
text3 = calculate_ci("Scholarship", 0.05)
viz3; text3
Out[16]:
Out[16]:
'CI is (3.095, 4.771), proportion in the first group (Scholarship=0) tends to be greater'
In [17]:
viz4 = group_draw("Gender")
text4 = calculate_ci("Gender", 0.05)
viz4; text4
Out[17]:
Out[17]:
'CI is (-0.843, 0.147), can not make definitive decision'
In [18]:
viz5 = group_draw("Hypertension")
text5 = calculate_ci("Hypertension", 0.05)
viz5; text5
Out[18]:
Out[18]:
'CI is (-4.166, -3.029), proportion in the first (Hypertension=0) group tends to be less'
In [19]:
viz6 = group_draw("Diabetes")
text6 = calculate_ci("Diabetes", 0.05)
viz6; text6
Out[19]:
Out[19]:
'CI is (-3.236, -1.476), proportion in the first (Diabetes=0) group tends to be less'
In [20]:
viz7 = group_draw("Alcoholism")
text7 = calculate_ci("Alcoholism", 0.05)
viz7; text7
Out[20]:
Out[20]:
'CI is (-1.42, 1.335), can not make definitive decision'
In [21]:
viz8 = group_draw("SMS_received")
text8 = calculate_ci("SMS_received", 0.05)
viz8; text8
Out[21]:
Out[21]:
'CI is (10.34, 11.413), proportion in the first group (SMS_received=0) tends to be greater'

Now, let's calculate probabilitites of no-show for each age, neighborhood, scheduled hour and awaiting time and see if there are any patterns

In [22]:
probs_age = pd.crosstab(index=df["Age"], columns=df["No-show"])
probs_age["prob_show"] = probs_age.loc[:, 1] / (
    probs_age.loc[:, 0] + probs_age.loc[:, 1]
)
viz9 = probs_age.hvplot.scatter(x="Age", y="prob_show")
viz9
Out[22]:
In [23]:
probs_place = pd.crosstab(index=df["Neighbourhood"], columns=df["No-show"])
probs_place["prob_show"] = probs_place.loc[:, 1] / (
    probs_place.loc[:, 0] + probs_place.loc[:, 1]
)
viz10 = probs_place.hvplot.scatter(x="Neighbourhood", y="prob_show")
viz10
Out[23]:

As we can see here age's and neighborhood's probabilities are distributed almost uniformly (except for a single peculiar points for each graph)

In [24]:
probs_hour = (
    pd.crosstab(index=df["ScheduledDay"].dt.hour, columns=df["No-show"])
    .reset_index()
    .rename(columns={"ScheduledDay": "ScheduledHour"})
)
probs_hour["prob_hour"] = probs_hour.loc[:, 1] / (
    probs_hour.loc[:, 0] + probs_hour.loc[:, 1]
)
viz11 = probs_hour.hvplot.scatter(x="ScheduledHour", y="prob_hour")
viz11
Out[24]:
In [25]:
probs_await = pd.crosstab(index=df["AwaitingTime"], columns=df["No-show"])
probs_await["prob_await"] = probs_await.loc[:, 1] / (
    probs_await.loc[:, 0] + probs_await.loc[:, 1]
)
viz12 = probs_await.hvplot.scatter(x="AwaitingTime", y="prob_await")
viz12
Out[25]:
In [26]:
# import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()
X, y = (
    encoder.fit_transform(
        df.drop(["ScheduledDay", "AppointmentDay", "PatientId", "No-show"], axis=1)
    ),
    df["No-show"],
)

Now let's train and compair a pair of classifiers that will try to predict our target variable.

In [27]:
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df.drop(["ScheduledDay", "AppointmentDay", "PatientId", "No-show"], axis=1),
    df["No-show"],
    test_size=0.2,
)
mclf, bclf = MultinomialNB(), BernoulliNB()
mclf.fit(X_train, y_train)
bclf.fit(X_test, y_test)
print("Accuracy:", round(accuracy_score(y_test, mclf.predict(X_test)), 2) * 100, "%")
print("Accuracy:", round(accuracy_score(y_test, bclf.predict(X_test)), 2) * 100, "%")
Out[27]:
MultinomialNB()
Out[27]:
BernoulliNB()
Accuracy: 70.0 %
Accuracy: 80.0 %

Such a result is excpectable. Bernoulli classifier is well adopted for situations like we have here: a variety of binary features.

In [28]:
df_pane = pn.pane.DataFrame(df.head(), width=1400)
In [496]:
template = pn.template.FastListTemplate(
    theme=DefaultTheme,
    title="No Show Appointment Interactive Dashboard and Analysis",
    sidebar=[
        pn.pane.Markdown("# About the project"),
    ],
    main=[
        pn.Row(pn.Column(pn.pane.Markdown(text1), df_pane)),
        pn.Row(pn.Column(text01, viz1, viz2)),
        pn.Row(
            pn.Column(
                pn.pane.Markdown(code_ci),
                text03,
                viz3,
                text3,
                viz4,
                text4,
                viz5,
                text5,
                viz6,
                text6,
                viz7,
                text7,
                viz8,
                text8,
            )
        ),
    ],
)
template.servable();
In [497]:
template.show()
Launching server at http://localhost:60989
Out[497]:
<bokeh.server.server.Server at 0x7fadbb7d1e10>
WARNING:tornado.access:404 GET /static/extensions/panel/bundled/fastbasetemplate/images/header-columns-over-bg.gif (::1) 0.76ms
In [ ]: